import pandas as pd
import warnings
# Install a global "ignore" filter so no warnings are shown from this point on
# (it runs before the dataprep import below).
warnings.filterwarnings("ignore")
# First-time setup: uncomment the next line if dataprep is not installed in your environment.
# !pip --disable-pip-version-check install dataprep
from dataprep.eda import create_report, plot, plot_correlation, plot_missing
# Load the training split; the path is relative to the notebook's working directory.
df_train = pd.read_csv("../input/loan-eligible-dataset/loan-train.csv")
# Preview the first rows of the training frame.
df_train.head()
| | Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LP001002 | Male | No | 0 | Graduate | No | 5849 | 0.0 | NaN | 360.0 | 1.0 | Urban | Y |
| 1 | LP001003 | Male | Yes | 1 | Graduate | No | 4583 | 1508.0 | 128.0 | 360.0 | 1.0 | Rural | N |
| 2 | LP001005 | Male | Yes | 0 | Graduate | Yes | 3000 | 0.0 | 66.0 | 360.0 | 1.0 | Urban | Y |
| 3 | LP001006 | Male | Yes | 0 | Not Graduate | No | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 | Urban | Y |
| 4 | LP001008 | Male | No | 0 | Graduate | No | 6000 | 0.0 | 141.0 | 360.0 | 1.0 | Urban | Y |
# Load the test split (loan-test.csv) from the same dataset directory.
df_test = pd.read_csv("../input/loan-eligible-dataset/loan-test.csv")
# Preview the first rows of the test frame.
df_test.head()
| | Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LP001015 | Male | Yes | 0 | Graduate | No | 5720 | 0 | 110.0 | 360.0 | 1.0 | Urban |
| 1 | LP001022 | Male | Yes | 1 | Graduate | No | 3076 | 1500 | 126.0 | 360.0 | 1.0 | Urban |
| 2 | LP001031 | Male | Yes | 2 | Graduate | No | 5000 | 1800 | 208.0 | 360.0 | 1.0 | Urban |
| 3 | LP001035 | Male | Yes | 2 | Graduate | No | 2340 | 2546 | 100.0 | 360.0 | NaN | Urban |
| 4 | LP001051 | Male | No | 0 | Not Graduate | No | 3276 | 0 | 78.0 | 360.0 | 1.0 | Urban |
# (rows, columns) of the training frame.
df_train.shape
(614, 13)
# (rows, columns) of the test frame.
df_test.shape
(367, 12)
# With only a DataFrame argument, dataprep's plot() renders a dataset-level
# overview: summary statistics plus per-column insights (missing, skewed, ...).
plot(df_train)
| Number of Variables | 13 |
|---|---|
| Number of Rows | 614 |
| Missing Cells | 149 |
| Missing Cells (%) | 1.9% |
| Duplicate Rows | 0 |
| Duplicate Rows (%) | 0.0% |
| Total Size in Memory | 316.6 KB |
| Average Row Size in Memory | 528.0 B |
| Variable Types | Categorical: 9, Numerical: 4 |
| Gender has 13 (2.12%) missing values | Missing |
|---|---|
| Dependents has 15 (2.44%) missing values | Missing |
| Self_Employed has 32 (5.21%) missing values | Missing |
| LoanAmount has 22 (3.58%) missing values | Missing |
| Loan_Amount_Term has 14 (2.28%) missing values | Missing |
| Credit_History has 50 (8.14%) missing values | Missing |
| ApplicantIncome is skewed | Skewed |
| CoapplicantIncome is skewed | Skewed |
| LoanAmount is skewed | Skewed |
| Loan_Amount_Term is skewed | Skewed |
| Loan_ID has a high cardinality: 614 distinct values | High Cardinality |
|---|---|
| Loan_ID has constant length 8 | Constant Length |
| Credit_History has constant length 3 | Constant Length |
| Loan_Status has constant length 1 | Constant Length |
| Loan_ID has all distinct values | Unique |
| CoapplicantIncome has 273 (44.46%) zeros | Zeros |
# Build dataprep's profiling report for the training frame: the dataset
# overview followed by a statistics section for each variable.
create_report(df_train)
| Number of Variables | 13 |
|---|---|
| Number of Rows | 614 |
| Missing Cells | 149 |
| Missing Cells (%) | 1.9% |
| Duplicate Rows | 0 |
| Duplicate Rows (%) | 0.0% |
| Total Size in Memory | 316.6 KB |
| Average Row Size in Memory | 528.0 B |
| Variable Types | Categorical: 9, Numerical: 4 |
| Gender has 13 (2.12%) missing values | Missing |
|---|---|
| Dependents has 15 (2.44%) missing values | Missing |
| Self_Employed has 32 (5.21%) missing values | Missing |
| LoanAmount has 22 (3.58%) missing values | Missing |
| Loan_Amount_Term has 14 (2.28%) missing values | Missing |
| Credit_History has 50 (8.14%) missing values | Missing |
| ApplicantIncome is skewed | Skewed |
| CoapplicantIncome is skewed | Skewed |
| LoanAmount is skewed | Skewed |
| Loan_Amount_Term is skewed | Skewed |
| Loan_ID has a high cardinality: 614 distinct values | High Cardinality |
|---|---|
| Loan_ID has constant length 8 | Constant Length |
| Credit_History has constant length 3 | Constant Length |
| Loan_Status has constant length 1 | Constant Length |
| Loan_ID has all distinct values | Unique |
| CoapplicantIncome has 273 (44.46%) zeros | Zeros |
categorical
| Approximate Distinct Count | 614 |
|---|---|
| Approximate Unique (%) | 100.0% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 43.8 KB |
| Mean | 8 |
|---|---|
| Standard Deviation | 0 |
| Median | 8 |
| Minimum | 8 |
| Maximum | 8 |
| 1st row | LP001002 |
|---|---|
| 2nd row | LP001003 |
| 3rd row | LP001005 |
| 4th row | LP001006 |
| 5th row | LP001008 |
| Count | 1228 |
|---|---|
| Lowercase Letter | 0 |
| Space Separator | 0 |
| Uppercase Letter | 1228 |
| Dash Punctuation | 0 |
| Decimal Number | 3684 |
categorical
| Approximate Distinct Count | 2 |
|---|---|
| Approximate Unique (%) | 0.3% |
| Missing | 13 |
| Missing (%) | 2.1% |
| Memory Size | 40.7 KB |
| Mean | 4.3727 |
|---|---|
| Standard Deviation | 0.7794 |
| Median | 4 |
| Minimum | 4 |
| Maximum | 6 |
| 1st row | Male |
|---|---|
| 2nd row | Male |
| 3rd row | Male |
| 4th row | Male |
| 5th row | Male |
| Count | 2628 |
|---|---|
| Lowercase Letter | 2027 |
| Space Separator | 0 |
| Uppercase Letter | 601 |
| Dash Punctuation | 0 |
| Decimal Number | 0 |
categorical
| Approximate Distinct Count | 2 |
|---|---|
| Approximate Unique (%) | 0.3% |
| Missing | 3 |
| Missing (%) | 0.5% |
| Memory Size | 40.4 KB |
| Mean | 2.6514 |
|---|---|
| Standard Deviation | 0.4769 |
| Median | 3 |
| Minimum | 2 |
| Maximum | 3 |
| 1st row | No |
|---|---|
| 2nd row | Yes |
| 3rd row | Yes |
| 4th row | Yes |
| 5th row | No |
| Count | 1620 |
|---|---|
| Lowercase Letter | 1009 |
| Space Separator | 0 |
| Uppercase Letter | 611 |
| Dash Punctuation | 0 |
| Decimal Number | 0 |
categorical
| Approximate Distinct Count | 4 |
|---|---|
| Approximate Unique (%) | 0.7% |
| Missing | 15 |
| Missing (%) | 2.4% |
| Memory Size | 38.7 KB |
| Mean | 1.0851 |
|---|---|
| Standard Deviation | 0.2793 |
| Median | 1 |
| Minimum | 1 |
| Maximum | 2 |
| 1st row | 0 |
|---|---|
| 2nd row | 1 |
| 3rd row | 0 |
| 4th row | 0 |
| 5th row | 0 |
| Count | 0 |
|---|---|
| Lowercase Letter | 0 |
| Space Separator | 0 |
| Uppercase Letter | 0 |
| Dash Punctuation | 0 |
| Decimal Number | 599 |
categorical
| Approximate Distinct Count | 2 |
|---|---|
| Approximate Unique (%) | 0.3% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 44.3 KB |
| Mean | 8.873 |
|---|---|
| Standard Deviation | 1.6536 |
| Median | 8 |
| Minimum | 8 |
| Maximum | 12 |
| 1st row | Graduate |
|---|---|
| 2nd row | Graduate |
| 3rd row | Graduate |
| 4th row | Not Graduate |
| 5th row | Graduate |
| Count | 5314 |
|---|---|
| Lowercase Letter | 4566 |
| Space Separator | 134 |
| Uppercase Letter | 748 |
| Dash Punctuation | 0 |
| Decimal Number | 0 |
categorical
| Approximate Distinct Count | 2 |
|---|---|
| Approximate Unique (%) | 0.3% |
| Missing | 32 |
| Missing (%) | 5.2% |
| Memory Size | 38.2 KB |
| Mean | 2.1409 |
|---|---|
| Standard Deviation | 0.3482 |
| Median | 2 |
| Minimum | 2 |
| Maximum | 3 |
| 1st row | No |
|---|---|
| 2nd row | No |
| 3rd row | Yes |
| 4th row | No |
| 5th row | No |
| Count | 1246 |
|---|---|
| Lowercase Letter | 664 |
| Space Separator | 0 |
| Uppercase Letter | 582 |
| Dash Punctuation | 0 |
| Decimal Number | 0 |
numerical
| Approximate Distinct Count | 505 |
|---|---|
| Approximate Unique (%) | 82.2% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 9.6 KB |
| Mean | 5403.4593 |
| Minimum | 150 |
| Maximum | 81000 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 150 |
|---|---|
| 5-th Percentile | 1897.55 |
| Q1 | 2877.5 |
| Median | 3812.5 |
| Q3 | 5795 |
| 95-th Percentile | 14583 |
| Maximum | 81000 |
| Range | 80850 |
| IQR | 2917.5 |
| Mean | 5403.4593 |
|---|---|
| Standard Deviation | 6109.0417 |
| Variance | 3.732e+07 |
| Sum | 3.3177e+06 |
| Skewness | 6.5235 |
| Kurtosis | 60.039 |
| Coefficient of Variation | 1.1306 |
numerical
| Approximate Distinct Count | 287 |
|---|---|
| Approximate Unique (%) | 46.7% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 9.6 KB |
| Mean | 1621.2458 |
| Minimum | 0 |
| Maximum | 41667 |
| Zeros | 273 |
| Zeros (%) | 44.5% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 0 |
|---|---|
| 5-th Percentile | 0 |
| Q1 | 0 |
| Median | 1188.5 |
| Q3 | 2297.25 |
| 95-th Percentile | 4997.4 |
| Maximum | 41667 |
| Range | 41667 |
| IQR | 2297.25 |
| Mean | 1621.2458 |
|---|---|
| Standard Deviation | 2926.2484 |
| Variance | 8.5629e+06 |
| Sum | 995444.92 |
| Skewness | 7.4732 |
| Kurtosis | 84.2564 |
| Coefficient of Variation | 1.8049 |
numerical
| Approximate Distinct Count | 203 |
|---|---|
| Approximate Unique (%) | 34.3% |
| Missing | 22 |
| Missing (%) | 3.6% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 9.2 KB |
| Mean | 146.4122 |
| Minimum | 9 |
| Maximum | 700 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 9 |
|---|---|
| 5-th Percentile | 56 |
| Q1 | 100 |
| Median | 128 |
| Q3 | 168 |
| 95-th Percentile | 297.8 |
| Maximum | 700 |
| Range | 691 |
| IQR | 68 |
| Mean | 146.4122 |
|---|---|
| Standard Deviation | 85.5873 |
| Variance | 7325.1902 |
| Sum | 86676 |
| Skewness | 2.6708 |
| Kurtosis | 10.3038 |
| Coefficient of Variation | 0.5846 |
numerical
| Approximate Distinct Count | 10 |
|---|---|
| Approximate Unique (%) | 1.7% |
| Missing | 14 |
| Missing (%) | 2.3% |
| Infinite | 0 |
| Infinite (%) | 0.0% |
| Memory Size | 9.4 KB |
| Mean | 342 |
| Minimum | 12 |
| Maximum | 480 |
| Zeros | 0 |
| Zeros (%) | 0.0% |
| Negatives | 0 |
| Negatives (%) | 0.0% |
| Minimum | 12 |
|---|---|
| 5-th Percentile | 180 |
| Q1 | 360 |
| Median | 360 |
| Q3 | 360 |
| 95-th Percentile | 360 |
| Maximum | 480 |
| Range | 468 |
| IQR | 0 |
| Mean | 342 |
|---|---|
| Standard Deviation | 65.1204 |
| Variance | 4240.6678 |
| Sum | 205200 |
| Skewness | -2.3565 |
| Kurtosis | 6.608 |
| Coefficient of Variation | 0.1904 |
categorical
| Approximate Distinct Count | 2 |
|---|---|
| Approximate Unique (%) | 0.4% |
| Missing | 50 |
| Missing (%) | 8.1% |
| Memory Size | 37.5 KB |
| Mean | 3 |
|---|---|
| Standard Deviation | 0 |
| Median | 3 |
| Minimum | 3 |
| Maximum | 3 |
| 1st row | 1.0 |
|---|---|
| 2nd row | 1.0 |
| 3rd row | 1.0 |
| 4th row | 1.0 |
| 5th row | 1.0 |
| Count | 0 |
|---|---|
| Lowercase Letter | 0 |
| Space Separator | 0 |
| Uppercase Letter | 0 |
| Dash Punctuation | 0 |
| Decimal Number | 1128 |
categorical
| Approximate Distinct Count | 3 |
|---|---|
| Approximate Unique (%) | 0.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 42.9 KB |
| Mean | 6.5179 |
|---|---|
| Standard Deviation | 1.9426 |
| Median | 5 |
| Minimum | 5 |
| Maximum | 9 |
| 1st row | Urban |
|---|---|
| 2nd row | Rural |
| 3rd row | Urban |
| 4th row | Urban |
| 5th row | Urban |
| Count | 4002 |
|---|---|
| Lowercase Letter | 3388 |
| Space Separator | 0 |
| Uppercase Letter | 614 |
| Dash Punctuation | 0 |
| Decimal Number | 0 |
categorical
| Approximate Distinct Count | 2 |
|---|---|
| Approximate Unique (%) | 0.3% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 39.6 KB |
| Mean | 1 |
|---|---|
| Standard Deviation | 0 |
| Median | 1 |
| Minimum | 1 |
| Maximum | 1 |
| 1st row | Y |
|---|---|
| 2nd row | N |
| 3rd row | Y |
| 4th row | Y |
| 5th row | Y |
| Count | 614 |
|---|---|
| Lowercase Letter | 0 |
| Space Separator | 0 |
| Uppercase Letter | 614 |
| Dash Punctuation | 0 |
| Decimal Number | 0 |
# Passing a column name narrows plot() to a univariate analysis of that column.
plot(df_train, "Property_Area")
| Approximate Distinct Count | 3 |
|---|---|
| Approximate Unique (%) | 0.5% |
| Missing | 0 |
| Missing (%) | 0.0% |
| Memory Size | 42.9 KB |
| Mean | 6.5179 |
|---|---|
| Standard Deviation | 1.9426 |
| Median | 5 |
| Minimum | 5 |
| Maximum | 9 |
| 1st row | Urban |
|---|---|
| 2nd row | Rural |
| 3rd row | Urban |
| 4th row | Urban |
| 5th row | Urban |
| Count | 4002 |
|---|---|
| Lowercase Letter | 3388 |
| Space Separator | 0 |
| Uppercase Letter | 614 |
| Dash Punctuation | 0 |
| Decimal Number | 0 |
| Value | Count | Frequency (%) |
|---|---|---|
| Semiurban | 233 | 37.9% |
| Urban | 202 | 32.9% |
| Rural | 179 | 29.2% |